# CSL_coding.R

# Part of the replication archive for 
#
#   Bullock, John G. 2020. "Education and Attitudes toward Redistribution in
#   the United States." British Journal of Political Science 50.


# This file loads and codes compulsory-attendance-law datasets from 
# Acemoglu and Angrist (2001) and Goldin and Katz (2003). It also loads 
# a new attendance-law dataset compiled by Matthew Bettinger, Mary McGrath, 
# Celia Paris, Lauren Sexton, and me. These datasets are merged and saved in
# 'CSLdata.RData', which is then loaded by other files in this replication 
# archive.

library(Bullock, lib.loc = c(.libPaths(), 'packageLibrary'))    # for %IN%, qw()
library(car)        # for Recode()
library(dataverse)  # for get_file()
library(dplyr)      # for rename()
library(forcats)    # for fct_expand(), fct_relevel()
library(haven)      # for read_dta()
library(RCurl)      # for reading in the modern CSL data from my Google sheet
library(tidyr)      # for fill()



##############################################################################
# LOAD THE ACEMOGLU-ANGRIST DATASET (1914-1978)
##############################################################################
# Download the dataset from Angrist's Dataverse repository 
# (https://doi.org/10.7910/DVN/ZKEY9J).
# `CSL_AA` is used twice: first as the path of a temporary .RData file, then
# as the data object itself.
CSL_AA <- tempfile(fileext = '.RData')

# Fetch the file from the Dataverse repository and write it to the temporary
# path. 
# NOTE(review): whether get_file() still accepts format = 'RData' depends on
# the installed version of the dataverse package -- confirm before re-running.
writeBin(
  object = get_file(
    file    = 'impute3.tab',
    dataset = 'doi:10.7910/DVN/ZKEY9J',
    format  = 'RData'),
  con    = CSL_AA)

# The downloaded file is expected to define an object named `x`. Copy it 
# over the path stored in `CSL_AA`, then remove `x` from the workspace.
load(file = CSL_AA)
CSL_AA <- x
rm(list = 'x')



##############################################################################
# LOAD GOLDIN-KATZ DATASET (1910-1939)
##############################################################################
# Different authors' datasets have different names for the same variables.  
# Here, I rename some of the variables so that they have the Acemoglu-Angrist 
# variable names. This will make the coding easier later on.
# `CSL_GK` first holds the path of the downloaded zip archive and is then
# overwritten with the data read from the Stata file inside that archive.
CSL_GK <- tempfile(fileext = '.zip')
download.file(
  url      = 'http://scholar.harvard.edu/files/goldin/files/stata_claudia.zip',
  destfile = CSL_GK)
CSL_GK <- unz(CSL_GK, 'STATA Claudia.dta') %>% 
  read_dta() %>%
  # Give the Goldin-Katz variables their Acemoglu-Angrist names.
  rename(
    min      = ageent, 
    max      = agelev, 
    minexpt  = edexcg, 
    minawork = labage, 
    minework = labschl) %>%
  # Add 'DC' as a level of `state`. Only the new level is passed to 
  # fct_expand(): the factor itself already arrives through the pipe, so 
  # passing `state` again would feed every data value in as an extra "level".
  mutate(
    state = factor(state) %>% 
      fct_expand('DC')) %>% 
  # Put DC in alphabetical order. Sorting the full set of levels does this
  # without assuming how many state codes sort before "DC".
  mutate(state = fct_relevel(state, sort(levels(state))))



##############################################################################
# LOAD NEW DATA COLLECTED FOR THIS PROJECT (1979-2010)
##############################################################################
# The new dataset contains information on attendance laws that were in effect
# from 1979 through 2010.  
# NOTE(review): `ssl.verifypeer = FALSE` turns off verification of the
# server's certificate for this download, and `capath` is given the path of a
# single .pem file. Both settings are left as they were; confirm whether 
# verification can be re-enabled (e.g., via `cainfo`) before reusing this code.
options(RCurlOptions = list(
  capath         = system.file("CurlSSL", "cacert.pem", package = "RCurl"), 
  ssl.verifypeer = FALSE))

# Download the spreadsheet as CSV text and parse it through a text connection.
CSV_new <- getURL('https://docs.google.com/spreadsheets/d/1HaklrElBcIQaXgsU7xE7Qu2OQrALqOhS-IvMmLY7hfQ/export?format=csv&id=1HaklrElBcIQaXgsU7xE7Qu2OQrALqOhS-IvMmLY7hfQ&gid=0')
CSV_new_cnxn       <- textConnection(CSV_new)

# `stringsAsFactors = TRUE` is set explicitly. It was the read.csv() default
# before R 4.0.0, and the droplevels() call below requires `state` to be a 
# factor. Under the newer default (FALSE), `state` is a character vector and 
# droplevels() fails.
CSL_new            <- read.csv(
  file             = CSV_new_cnxn, 
  stringsAsFactors = TRUE,
  col.names        = c(
    'state', 'year', 'enroll_age', 'drop_age', 'req_sch', 'CA_source', 
    'CA_notes', 'work_age', 'work_sch', 'CL_source', 'CL_notes'))  
close(CSV_new_cnxn)

# Drop the source and notes columns, drop Alaska and Hawaii, and discard the
# factor levels that are no longer used.
CSL_new <- CSL_new %>%  
  select(-c(CA_source, CA_notes, CL_notes, CL_source)) %>%
  filter(! state %in% qw('Alaska Hawaii')) %>%
  mutate(state = droplevels(state))

# Convert the two age variables to integers. "NR" becomes 0; placeholder 
# entries ("---", empty strings, and -- for drop_age -- "FIND THIS OUT") 
# become NA.
CSL_new$enroll_age <- as.integer(
  Recode(
    var       = CSL_new$enroll_age, 
    recodes   = 'c("---", "")=NA; "NR"=0', 
    as.factor = FALSE))
CSL_new$drop_age   <- as.integer(
  Recode(
    var       = CSL_new$drop_age,   
    recodes   = 'c("---", "", "FIND THIS OUT")=NA; "NR"=0', 
    as.factor = FALSE))



##############################################################################
# CODE SOME NEW VARIABLES
##############################################################################
# These codings follow directly from Acemoglu and Angrist (2001, 55). The 
# rules mentioned in the comments below are from that page of their article.
# Acemoglu-Angrist data: "NR" is recoded to 0 in every variable. 
#   req_sch, work_age, work_sch: Rules 1, 4, and 5. (minexpt = min. schooling
#     to dropout, i.e., "req_sch".)
#   drop_age, enroll_age: these recodings don't follow directly from Acemoglu
#     and Angrist (2001, 55), but they seem sensible.
CSL_AA <- CSL_AA %>%
  mutate(
    req_sch    = car::Recode(minexpt,  '"NR"=0', as.factor = FALSE),
    work_age   = car::Recode(minawork, '"NR"=0', as.factor = FALSE),
    work_sch   = car::Recode(minework, '"NR"=0', as.factor = FALSE),
    drop_age   = car::Recode(max,      '"NR"=0', as.factor = FALSE),
    enroll_age = car::Recode(min,      '"NR"=0', as.factor = FALSE))

# Goldin-Katz data: 99 is recoded to 0 in req_sch, work_age, and work_sch
# (Rules 1, 4, and 5). 
# NOTE(review): drop_age and enroll_age are recoded with '"NR"=0' rather than
# '99=0'. If `max` and `min` use the same 99 code as the other Goldin-Katz
# variables, those values pass through unchanged -- confirm against the data.
CSL_GK <- CSL_GK %>%
  mutate(
    req_sch    = car::Recode(minexpt,  '99=0',   as.factor = FALSE),
    work_age   = car::Recode(minawork, '99=0',   as.factor = FALSE),
    work_sch   = car::Recode(minework, '99=0',   as.factor = FALSE),
    drop_age   = car::Recode(max,      '"NR"=0', as.factor = FALSE),
    enroll_age = car::Recode(min,      '"NR"=0', as.factor = FALSE))



##############################################################################
# CORRECT THE ACEMOGLU-ANGRIST DATA
##############################################################################
# See the appendix of the article for details. There are a few further 
# corrections at the end of this file.  

# Each correction overwrites values for one state (identified by `pob`) over
# a range of years. Where both lines are present, the raw variable (minexpt)
# and its recoded counterpart (req_sch) are changed together.

# Alabama, 1950 onward: required schooling set to 8.
CSL_AA$minexpt[CSL_AA$pob=='Alabama' & CSL_AA$year>=1950] <- 8
CSL_AA$req_sch[CSL_AA$pob=='Alabama' & CSL_AA$year>=1950] <- 8

# Georgia, 1946 onward: required schooling set to 7.
CSL_AA$minexpt[CSL_AA$pob=='Georgia'     & CSL_AA$year>=1946] <- 7
CSL_AA$req_sch[CSL_AA$pob=='Georgia'     & CSL_AA$year>=1946] <- 7

# Kentucky, 1935 onward: only the recoded variable is changed (to 0); 
# minexpt is left as it is.
CSL_AA$req_sch[CSL_AA$pob=='Kentucky' & CSL_AA$year>=1935] <- 0

# Illinois, 1978 only: raw value set to 'NR' and recoded value set to 0.
CSL_AA$minexpt[CSL_AA$pob=='Illinois'    & CSL_AA$year==1978] <- 'NR'
CSL_AA$req_sch[CSL_AA$pob=='Illinois'    & CSL_AA$year==1978] <- 0

# Massachusetts (stored as 'Massachu'), 1965 onward: enrollment age set to 7.
# Only enroll_age is changed, not the `min` variable it was recoded from.
CSL_AA$enroll_age[CSL_AA$pob=='Massachu' & CSL_AA$year>=1965] <- 7



#######################################################################
# ADD TWO-LETTER STATE ABBREVIATIONS TO THE DATASETS
#######################################################################
# Acemoglu-Angrist data: map each (truncated) state name in `pob` to its
# two-letter abbreviation.
CSL_AA$state <- car::Recode(
  CSL_AA$pob, 
  '"Connecti"="CT"; "Maine"="ME"; "Massachu"="MA"; "NewHamps"="NH"; 
             "RhodeIsl"="RI"; "Vermont"="VT"; "Delaware"="DE"; "NewJerse"="NJ"; 
             "NewYork"="NY";  "Pennsylv"="PA"; "Illinois"="IL"; "Indiana"="IN"; 
             "Michigan"="MI"; "Ohio"="OH"; "Wisconsi"="WI"; "Iowa"="IA"; 
             "Kansas"="KS";   "Minnesot"="MN"; "Missouri"="MO"; "Nebraska"="NE"; 
             "NorthDak"="ND"; "SouthDak"="SD"; "Virginia"="VA"; "Alabama"="AL";
             "Arkansas"="AR"; "Florida"="FL"; "Georgia"="GA";  "Louisian"="LA"; 
             "Mississi"="MS"; "NorthCar"="NC"; "SouthCar"="SC"; "Texas"="TX"; 
             "Kentucky"="KY"; "Maryland"="MD"; "Oklahoma"="OK"; "Tennesse"="TN"; 
             "WestVirg"="WV"; "Arizona"="AZ"; "Colorado"="CO"; "Idaho"="ID"; 
             "Montana"="MT";  "Nevada"="NV"; "NewMexic"="NM"; "Utah"="UT"; 
             "Wyoming"="WY";  "Californ"="CA"; "Oregon"="OR";   "Washingt"="WA"')


# New data: look up each full state name in R's built-in state.name / 
# state.abb vectors, then patch the two entries that the lookup cannot 
# resolve.
CSL_new$state.2L <- with(CSL_new, {
  abbrev <- setNames(state.abb, state.name)[as.character(state)]
  abbrev[state == 'DC']         <- 'DC'
  abbrev[state == 'Tennessee '] <- 'TN'  # this entry has a trailing space
  factor(abbrev)
})



#######################################################################
# MERGE THE DATASETS
#######################################################################
# Start from the Acemoglu-Angrist data, keeping only the variables that the
# merged dataset needs.
CSLdata <- CSL_AA[, qw('state year enroll_age drop_age req_sch work_age work_sch')]

# ADD GOLDIN-KATZ DATA
# For each state in the Acemoglu-Angrist data, find the Goldin-Katz rows for
# years that the Acemoglu-Angrist data do not cover for that state. All row
# indices are collected first so that the rows can be appended in a single 
# bind_rows() call, rather than re-copying CSLdata once per state. The rows
# are appended in the same order as before: by state level, then by row.
rowsToAdd <- unlist(lapply(
  levels(CSLdata$state),
  function (st) {
    which(CSL_GK$state == st & !CSL_GK$year %IN% CSLdata$year[CSLdata$state == st])
  }))

# names(CSLdata) is exactly the set of variables selected above.
CSLdata <- bind_rows(CSLdata, CSL_GK[rowsToAdd, names(CSLdata)])

# ADD THE NEW DATA
# Keep only the years after 1978, and replace the full-name `state` column 
# with the two-letter abbreviations, so that `state` is coded as it is in the
# rest of the merged data.
CSL_new <- rename(
  select(filter(CSL_new, year > 1978), -state),
  state = state.2L)

# Append the new rows and sort the merged data by state and year.
CSLdata <- arrange(bind_rows(CSLdata, CSL_new), state, year)



##################################################################################
# IMPUTE MISSING VALUES OF KEY VARIABLES FOR 1979 THROUGH THE PRESENT 
##################################################################################
# We group by state before calling fill() so that, if a given variable is NA
# for the first year in a given state, we won't fill in the NA with a value 
# from the last year of the preceding state. 
# Within each state, replace each NA with the most recent non-missing value 
# of the same variable.
# NOTE(review): fill(everything()) is not restricted to 1979 onward; it acts 
# on every column in every year. Also, the result remains grouped by state, 
# because ungroup() is not called.
CSLdata <- fill(group_by(CSLdata, state), everything())



##############################################################################
# CREATE CA AND CL VARIABLES FOR THE MERGED DATASET 
##############################################################################

# CREATE CA (COMPULSORY ATTENDANCE) VARIABLE
# CA: years of compulsory attendance (Rules 2 and 3). 
# CAdiff is the gap between the dropout age and the enrollment age. If either
# age is 0, CA is the required schooling (floored at 0). Otherwise it is the
# larger of CAdiff and the required schooling.
CSLdata$CAdiff <- with(CSLdata, drop_age - enroll_age)
CSLdata$CA     <- with(
  CSLdata, 
  ifelse(
    enroll_age == 0 | drop_age == 0, 
    pmax(0, req_sch), 
    pmax(CAdiff, req_sch)))

# CL: the child-labor counterpart (Rules 6 and 7). It is needed for the 
# comparison of different instrument sets that is reported in the appendix.
# CLdiff is the gap between the minimum work age and the enrollment age. If
# the enrollment age is 0, CL is the schooling required for a work permit 
# (floored at 0). Otherwise it is the larger of CLdiff and that requirement.
CSLdata$CLdiff <- with(CSLdata, work_age - enroll_age)
CSLdata$CL     <- with(
  CSLdata, 
  ifelse(
    enroll_age == 0, 
    pmax(0, work_sch), 
    pmax(CLdiff, work_sch)))



##############################################################################
# MAKE FURTHER CORRECTIONS TO THE CSL DATA
##############################################################################
# See the appendix for details. On California, see also the notes in the
# Google spreadsheet of CSL data. 
# South Carolina, 1956-1958, and Mississippi, 1957-1958: CA and CL set to 0.
CSLdata[with(CSLdata, state == 'SC' & year %in% 1956:1958), qw("CA CL")] <- 0
CSLdata[with(CSLdata, state == 'MS' & year %in% 1957:1958), qw("CA CL")] <- 0

# California, 1977-1987: CA set to 10. (CL is not changed.)
CSLdata[with(CSLdata, state == 'CA' & year %in% 1977:1987), 'CA'] <- 10



##############################################################################
# EXAMINE THRESHOLD CROSSINGS
##############################################################################
# The instruments that I use are a "moderate" dummy for states requiring 
# 8-10 years of school, and a "strict" dummy for states requiring more than 10
# years. What proportion of changes in schooling laws involved a change from 
# one of these categories to another?
#  I report the answer in an appendix footnote: "fully XX% of all changes 
# made..."
# This diagnostic runs only in interactive sessions. Every result is wrapped
# in print(): R auto-prints only the value of a complete top-level 
# expression, so results computed inside the braces would otherwise be 
# discarded when the block is run as a whole.
if (interactive()) {
  # Compare each state's CA with its value in the preceding row. Within each
  # state, the first row has no predecessor, so its comparisons are NA.
  tmp <- CSLdata %>%
    group_by(state) %>%
    mutate(
      CA_change      = (CA != dplyr::lag(CA)),  # did CA change at all from year to year?
      CA_crossMod    = (dplyr::lag(CA)<8  & CA>=8)  | (dplyr::lag(CA)>=8  & CA<8),    # crossed the 8-year threshold?
      CA_crossStrict = (dplyr::lag(CA)<11 & CA>=11) | (dplyr::lag(CA)>=11 & CA<11))   # crossed the 11-year threshold?

  print(table(tmp$CA_change))
  print(table(tmp$CA_crossMod))
  print(table(tmp$CA_crossStrict))
  print(table(tmp$CA_crossMod | tmp$CA_crossStrict))

  # Proportion of all changes in CA that crossed at least one threshold.
  print(
    sum(tmp$CA_crossMod | tmp$CA_crossStrict, na.rm = TRUE) / 
      sum(tmp$CA_change, na.rm = TRUE))
  rm(tmp)
}



##############################################################################
# SAVE CSL DATA AND CLEAN UP
##############################################################################
# Remove the intermediate objects, leaving the merged `CSLdata` in place.
# NOTE(review): no save() call is visible in this section, although the file
# header says that the data are saved in 'CSLdata.RData' -- confirm that the
# save happens elsewhere. `CSV_new` (the raw CSV text) is not removed here.
rm(list = c('CSL_AA', 'CSL_GK', 'CSL_new', 'CSV_new_cnxn', 'rowsToAdd'))
